# Load required library
library(jsonlite)
# Convert one match JSON file into a ball-by-ball data frame.
#
# Args:
#   file_path: path of the JSON file; it is handed to read_json().
#
# Returns:
#   A data frame with one row per delivery of the first two innings and the
#   columns baller, batter, balls, runsperball, extra, total_score, wickets,
#   country, innings, toss_decision, toss_winner, margin and match_winner.
#   An empty data frame is returned when `info$outcome$result` is present or
#   when no delivery could be read.
#
# Column notes:
#   balls       - (over position - 1) + 0.1 * (position of the ball in the over
#                 minus the extras already seen in that over).
#   total_score - running sum of `runs$total` within the innings.
#   wickets     - running wicket count of the innings on deliveries that carry
#                 a `wickets` entry, 0 on every other delivery.
convert_to_csv <- function(file_path) {
  json_data <- read_json(path = file_path)
  info <- json_data$info

  # Return empty dataframe if result is available
  if (!is.null(info$outcome$result)) {
    return(data.frame())
  }

  # Substitute a typed NA for a missing (NULL) field so columns stay aligned.
  or_na <- function(value, missing) {
    if (is.null(value)) missing else value
  }

  # Match-level values are identical on every row, so read them once.
  toss_winner <- or_na(info$toss$winner, NA_character_)
  toss_decision <- or_na(info$toss$decision, NA_character_)
  match_winner <- or_na(info$outcome$winner, NA_character_)
  margin <- if (!is.null(info$outcome$by$wickets)) {
    paste(info$outcome$by$wickets, " wickets")
  } else if (!is.null(info$outcome$by$runs)) {
    paste(info$outcome$by$runs, " runs")
  } else {
    NA_character_
  }

  # The original loop ran while `balls <= 300`, i.e. at most 51 over slots.
  max_overs <- 51L
  records <- list()

  # Only the first two innings are converted.
  for (inn in seq_len(min(2L, length(json_data$innings)))) {
    # A malformed innings stops with a warning; rows read so far are kept.
    tryCatch({
      innings_data <- json_data$innings[[inn]]
      team <- or_na(innings_data$team, NA_character_)
      overs <- innings_data$overs
      cumulative_score <- 0
      cum_wickets <- 0 # reset for every innings

      for (over_index in seq_len(min(length(overs), max_overs))) {
        deliveries <- overs[[over_index]]$deliveries
        extras_count <- 0

        for (currball in seq_along(deliveries)) {
          delivery <- deliveries[[currball]]
          has_extras <- !is.null(delivery$extras)

          # The label uses the extras seen BEFORE this delivery.
          ball_label <- (over_index - 1) + 0.1 * (currball - extras_count)
          cumulative_score <- cumulative_score + delivery$runs$total

          wicket_value <- 0
          if (!is.null(delivery$wickets)) {
            cum_wickets <- cum_wickets + 1
            wicket_value <- cum_wickets
          }

          record <- list(
            baller = delivery$bowler,
            batter = delivery$batter,
            balls = ball_label,
            runsperball = delivery$runs$batter,
            extra = has_extras,
            total_score = cumulative_score,
            wickets = wicket_value,
            country = team,
            innings = inn,
            toss_decision = toss_decision,
            toss_winner = toss_winner,
            margin = margin,
            match_winner = match_winner
          )
          if (any(lengths(record) != 1L)) {
            stop("delivery ", currball, " of over position ", over_index,
                 " is missing a required field")
          }
          records[[length(records) + 1L]] <- record

          # Every extra except leg byes shifts the label of later balls.
          # NOTE(review): byes are therefore treated like wides/no-balls here;
          # confirm that this is intended.
          if (has_extras && is.null(delivery$extras$legbyes)) {
            extras_count <- extras_count + 1
          }
        }
      }
    }, error = function(e) {
      warning("Stopped reading innings ", inn, " of ", file_path, ": ",
              conditionMessage(e), call. = FALSE)
    })
  }

  if (length(records) == 0L) {
    return(data.frame())
  }

  # Create dataframe
  column <- function(name) {
    unlist(lapply(records, function(record) record[[name]]))
  }
  data.frame(
    baller = column("baller"),
    batter = column("batter"),
    balls = column("balls"),
    runsperball = column("runsperball"),
    extra = column("extra"),
    total_score = column("total_score"),
    wickets = column("wickets"),
    country = column("country"),
    innings = column("innings"),
    toss_decision = column("toss_decision"),
    toss_winner = column("toss_winner"),
    margin = column("margin"),
    match_winner = column("match_winner")
  )
}
library(ggplot2)
library(gridExtra)
library(dplyr)
# ---- Kernel density plots of bowling and batting metrics per team ----

# Fill and outline colour of each team, shared by all four density plots.
team_colours <- c("Australia" = "yellow", "India" = "blue")

# Build the labelled one-column data frame for one team.
#   values       - metric column taken from the source data
#   is_australia - logical vector, TRUE where the row's country is "Australia"
#   column       - name given to the metric column in the result
#   team         - "Australia" selects the TRUE rows; any other label selects
#                  the remaining rows (the original `else` branch)
# Works for zero selected rows, where `df$Group <- "label"` would fail.
team_metric <- function(values, is_australia, column, team) {
  picked <- if (team == "Australia") values[is_australia] else values[!is_australia]
  out <- data.frame(picked)
  names(out) <- column
  out$Group <- rep(team, nrow(out))
  out
}

# Density plot of `value_col`, one filled curve per value of `Group`.
kde_plot <- function(plot_data, value_col, title, x_label) {
  ggplot(plot_data, aes(x = .data[[value_col]], fill = Group, color = Group)) +
    geom_density(alpha = 0.7) +
    labs(title = title, x = x_label, y = "Density") +
    scale_fill_manual(values = team_colours) +
    scale_color_manual(values = team_colours) +
    theme_minimal()
}

# Bowler statistics
data_path <- "D:/Minor_Project-II/Data/IND vs AUS/ODI/BowlerStats.csv"
data <- read.csv(data_path)

# Separate data for each country; every row that is not "Australia" is
# treated as India. `%in%` never yields NA, so a missing country cannot
# abort the split.
bowler_is_aus <- data$country %in% "Australia"
aus_players_economy <- team_metric(data$economy, bowler_is_aus, "economy", "Australia")
ind_players_economy <- team_metric(data$economy, bowler_is_aus, "economy", "India")
combined_economy_data <- rbind(aus_players_economy, ind_players_economy)
aus_players_bowling_avg <- team_metric(data$bowling_average, bowler_is_aus, "bowling_avg", "Australia")
ind_players_bowling_avg <- team_metric(data$bowling_average, bowler_is_aus, "bowling_avg", "India")
combined_bowling_avg_data <- rbind(aus_players_bowling_avg, ind_players_bowling_avg)

# KDE plot of economies
economy_plot <- kde_plot(
  combined_economy_data, "economy",
  "Kernel Density Estimation Plot of Economies", "Economy"
)
# KDE plot of bowling averages
bowling_avg_plot <- kde_plot(
  combined_bowling_avg_data, "bowling_avg",
  "Kernel Density Estimation Plot of Bowling Averages", "Bowling Average"
)

# Load additional data: batsman statistics
strike_rate_path <- "D:/Minor_Project-II/Data/IND vs AUS/ODI/BatsmanScores.csv"
data_strike_rate <- read.csv(strike_rate_path)

batter_is_aus <- data_strike_rate$country %in% "Australia"
aus_players_strike_rate <- team_metric(data_strike_rate$strike_rate, batter_is_aus, "strike_rate", "Australia")
ind_players_strike_rate <- team_metric(data_strike_rate$strike_rate, batter_is_aus, "strike_rate", "India")
combined_strike_rate_data <- rbind(aus_players_strike_rate, ind_players_strike_rate)
aus_players_batting_avg <- team_metric(data_strike_rate$batting_average, batter_is_aus, "batting_avg", "Australia")
ind_players_batting_avg <- team_metric(data_strike_rate$batting_average, batter_is_aus, "batting_avg", "India")
combined_batting_avg_data <- rbind(aus_players_batting_avg, ind_players_batting_avg)

# KDE plot of strike rates
strike_rate_plot <- kde_plot(
  combined_strike_rate_data, "strike_rate",
  "Kernel Density Estimation Plot of Strike Rates", "Strike Rate"
)
# KDE plot of batting averages
batting_avg_plot <- kde_plot(
  combined_batting_avg_data, "batting_avg",
  "Kernel Density Estimation Plot of Batting Averages", "Batting Average"
)

# Print the four plots one after another (they are not arranged in a grid).
print(economy_plot)
print(bowling_avg_plot)
print(strike_rate_plot)
print(batting_avg_plot)
# Load necessary libraries
library(ggplot2)
library(plotly)
# Read the bowler statistics used by both interactive charts
data_path <- "D:/Minor_Project-II/Data/IND vs AUS/ODI/BowlerStats.csv"
data <- read.csv(data_path)

# Dodged bar chart: one bar per player, filled by country. The player names
# on the x axis are blanked out; they stay available through the tooltip.
p <- ggplot(data, aes(x = player, y = wickets, fill = country)) +
  geom_bar(position = "dodge", stat = "identity") +
  ggtitle("Wickets Taken by Player and Country") +
  xlab("Player") +
  ylab("Wickets") +
  scale_fill_manual(values = c("Australia" = "yellow", "India" = "blue")) +
  theme_minimal() +
  theme(axis.text.x = element_blank())

# Interactive version of the bar chart, with player and wickets in the tooltip
p_plotly <- ggplotly(p, tooltip = c("player", "wickets"))
# Display it
p_plotly

# Bubble chart: score against wickets, marker size taken from economy,
# colour from country, hover text from player.
bubble_chart <- layout(
  plot_ly(
    data,
    x = ~score, y = ~wickets, size = ~economy,
    color = ~country, text = ~player, hoverinfo = "text",
    mode = "markers", type = "scatter"
  ),
  title = "Bubble Chart of Bowler Performance Metrics",
  xaxis = list(title = "Runs Conceded"),
  yaxis = list(title = "Wickets"),
  showlegend = TRUE
)
# Display it
bubble_chart
# Score the saved base models and the stacked model on the combined dataset
df <- read.csv("D:/Minor_Project-II/Data/Combined.csv")

# Model inputs: the two style columns plus wickets
selected_features <- c("batter_style", "wickets", "bowler_style")
x <- df[selected_features]

# Binary encoding of the categorical columns:
# "right" -> 0 and "fast" -> 0, every other value -> 1
x$batter_style <- ifelse(x$batter_style != "right", 1, 0)
x$bowler_style <- ifelse(x$bowler_style != "fast", 1, 0)

# Response variable
y <- df$balls

# Previously trained models
stack_model <- readRDS("D:/Minor_Project-II/Code/Model/hybrid_wicket_trained_model.rds")
models <- readRDS("D:/Minor_Project-II/Code/Model/models.rds")

# One column of predictions per base model; warnings raised while
# predicting are suppressed.
predictions <- suppressWarnings(
  data.frame(
    linear_reg = predict(models$linear_reg, newdata = x),
    rf_model = predict(models$rf_model, newdata = x),
    xgboost = predict(
      models$xgboost,
      newdata = x,
      iteration_range = c(1, models$xgboost$bestTune$nrounds)
    )
  )
)

# The stacked model takes the base-model predictions as its input
stacked_predictions <- predict(stack_model, newdata = predictions)
# Error metrics of `predicted` against `actual`.
#
# Returns a named numeric vector with, in this order:
#   MAE       - mean absolute error
#   RMSE      - square root of MSE
#   MSE       - mean squared error
#   R_squared - 1 - (residual sum of squares / total sum of squares)
calculate_metrics <- function(actual, predicted) {
  residual <- actual - predicted
  squared_residual <- residual^2
  mean_squared <- mean(squared_residual)
  total_ss <- sum((actual - mean(actual))^2)
  c(
    MAE = mean(abs(residual)),
    RMSE = sqrt(mean_squared),
    MSE = mean_squared,
    R_squared = 1 - sum(squared_residual) / total_ss
  )
}
# Empty, typed table that fixes the column layout of the result
metrics <- data.frame(
  Model = character(),
  MAE = numeric(),
  RMSE = numeric(),
  MSE = numeric(),
  R_squared = numeric(),
  stringsAsFactors = FALSE
)

# One row per base model, in the column order of `predictions`
base_model_rows <- lapply(colnames(predictions), function(model_name) {
  data.frame(
    Model = model_name,
    t(calculate_metrics(y, predictions[[model_name]]))
  )
})

# Final row for the stacked model
stacked_metrics <- calculate_metrics(y, stacked_predictions)
stacked_row <- data.frame(Model = "stacked", t(stacked_metrics))

metrics <- do.call(rbind, c(list(metrics), base_model_rows, list(stacked_row)))

# Show the table
print(metrics)
## Model MAE RMSE MSE R_squared
## 1 linear_reg 12.32996 14.27544 203.7882 0.00993324
## 2 rf_model 12.30647 14.24997 203.0616 0.01346291
## 3 xgboost 12.29896 14.24703 202.9779 0.01386975
## 4 stacked 12.30691 14.25022 203.0689 0.01342770
# Bar chart comparing every metric across models, one facet per metric with
# its own y scale
metrics_long <- reshape2::melt(metrics, id.vars = "Model")
ggplot(metrics_long, aes(x = Model, y = value, fill = Model)) +
  geom_bar(position = "dodge", stat = "identity") +
  facet_wrap(~variable, scales = "free_y") +
  theme_minimal() +
  labs(title = "Error Metrics Comparison", x = "Model", y = "Value") +
  theme(legend.position = "none")

# Actual vs predicted scatter for each base model; the dashed red line is
# the identity line
for (model in colnames(predictions)) {
  scatter_data <- data.frame(Actual = y, Predicted = predictions[[model]])
  p <- ggplot(scatter_data, aes(x = Actual, y = Predicted)) +
    geom_point(alpha = 0.6, color = "blue") +
    geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
    labs(
      title = paste("Actual vs Predicted -", model),
      x = "Actual Values",
      y = "Predicted Values"
    ) +
    theme_minimal()
  print(p)
}

# Residuals (actual minus predicted) against predictions for each base model
for (model in colnames(predictions)) {
  predicted <- predictions[[model]]
  residual_data <- data.frame(Predicted = predicted, Errors = y - predicted)
  p <- ggplot(residual_data, aes(x = Predicted, y = Errors)) +
    geom_point(alpha = 0.6, color = "blue") +
    geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
    labs(
      title = paste("Residuals vs Predicted -", model),
      x = "Predicted Values",
      y = "Residuals (Errors)"
    ) +
    theme_minimal()
  print(p)
}

# Histogram of the residuals for each base model, bins of width 1
for (model in colnames(predictions)) {
  error_data <- data.frame(Errors = y - predictions[[model]])
  p <- ggplot(error_data, aes(x = Errors)) +
    geom_histogram(binwidth = 1, alpha = 0.6, color = "black", fill = "blue") +
    labs(
      title = paste("Distribution of Residuals -", model),
      x = "Residuals (Errors)",
      y = "Frequency"
    ) +
    theme_minimal()
  print(p)
}

# Write the stacked model back to disk for future use
saveRDS(stack_model, "D:/Minor_Project-II/Code/Model/hybrid_wicket_trained_model.rds")
# Summarise ball-by-ball rows into one row per (innings, over).
#
# Args:
#   ball_by_ball: data frame with the columns innings, balls, runsperball,
#                 wickets and total_score.
#   filename:     accepted for compatibility with existing callers; unused.
#   total_overs:  overs available to each innings (default 50).
#
# Returns:
#   An ungrouped data frame with the columns innings, over, runs_in_over,
#   wickets_in_over, run_rate, score, required_run_rate and
#   runs_in_previous_3_overs; an empty data frame when the input has no rows.
OverBYOver <- function(ball_by_ball, filename, total_overs = 50) {
  library(dplyr)
  library(tidyr)
  library(zoo)

  if (nrow(ball_by_ball) == 0) {
    return(data.frame())
  }

  over_by_over <- ball_by_ball %>%
    group_by(innings, over = floor(balls)) %>%
    summarise(
      runs_in_over = sum(runsperball),
      # Count the deliveries flagged with a wicket. Summing the column is
      # wrong when it stores a running wicket count on wicket balls.
      wickets_in_over = sum(wickets > 0),
      run_rate = runs_in_over / 6,
      score = max(total_score)
    ) %>%
    ungroup()

  # The chase target is one more than the first-innings total. It has to be
  # taken before grouping by innings; within the second-innings group only
  # that innings' own scores are visible.
  first_innings_scores <- over_by_over$score[over_by_over$innings == 1]
  target <- if (length(first_innings_scores) > 0) {
    max(first_innings_scores) + 1
  } else {
    NA_real_
  }

  over_by_over %>%
    group_by(innings) %>%
    mutate(
      # 0 for the first innings; NA when no overs are left to divide by.
      required_run_rate = ifelse(
        innings == 1,
        0,
        ifelse(
          total_overs - over > 0,
          pmax(target - score, 0) / (total_overs - over),
          NA_real_
        )
      ),
      # Right-aligned window of width 3 that includes the current over;
      # rows without a full window are filled with 0.
      runs_in_previous_3_overs = rollapplyr(runs_in_over, width = 3, FUN = sum, fill = 0)
    ) %>%
    ungroup()
}
# ---- Aggregate the over-by-over summaries of all IND vs AUS ODI matches ----

# Ball-by-ball CSV files that are available
ball_by_ball_dir <- "D:/Minor_Project-II/Data/Datasets/BallByBall"
available_csvs <- list.files(ball_by_ball_dir)

library(jsonlite)
base_dir <- "D:/Minor_Project-II/Data/IND vs AUS/ODI"
year_folders <- list.files(base_dir)

# JSON match files found in the year folders. Entries of `base_dir` that are
# not folders simply contribute no files.
json_files <- unlist(lapply(year_folders, function(year) {
  list.files(file.path(base_dir, year), pattern = "\\.json$")
}))

# A match is processed only when its "<name>.csv" exists in the ball-by-ball
# folder.
match_csvs <- intersect(sub("\\.json$", ".csv", json_files), available_csvs)

# One over-by-over summary per match. lapply() over the (possibly empty)
# vector avoids the 1:0 trap of `1:length(...)`.
per_match <- lapply(match_csvs, function(csv_name) {
  OverBYOver(read.csv(file.path(ball_by_ball_dir, csv_name)), csv_name)
})

# Bind everything in one call instead of growing a data frame in a loop.
aggregated_over_by_over <- do.call(rbind, c(list(data.frame()), per_match))
write.csv(aggregated_over_by_over, "D:/Minor_Project-II/Data/Datasets/OverByOver/Aggregated.csv", row.names = FALSE)
# This cell builds the model that predicts a team's score from the match
# situation. The features include innings, over, runs_in_the_over, wickets,
# wickets_in_the_over, runs_in_previous_3_overs, run_rate, and
# required_run_rate.
# Load necessary libraries
library(tidyr)
library(dplyr)
library(caret)
## Loading required package: lattice
library(ggplot2)
library(ggpubr)
library(pdp)
# Train a linear model that predicts `score` from the over-by-over features,
# report its test error and draw the diagnostic plots.
# NOTE(review): the whole cell runs inside suppressWarnings(), so warnings
# from caret, pdp and ggplot2 are hidden.
suppressWarnings({
  # Read the aggregated CSV file
  data <- read.csv("D:/Minor_Project-II/Data/Datasets/OverByOver/Aggregated.csv")

  # Handle missing values: substitute NA with 0 in required run rate
  data$required_run_rate[is.na(data$required_run_rate)] <- 0

  # Split the data into training and testing sets
  set.seed(123) # For reproducibility
  train_index <- createDataPartition(data$score, p = 0.8, list = FALSE)
  train_data <- data[train_index, ]
  test_data <- data[-train_index, ]

  # Define features and target variable
  features <- c("over", "wickets_in_over",
                "runs_in_over", "run_rate", "required_run_rate",
                "runs_in_previous_3_overs", "innings")
  target <- "score"

  # Check if the columns exist in the training data
  if (all(c(features, target) %in% colnames(train_data))) {
    # Train the model (linear regression)
    model <- train(
      score ~ .,
      data = train_data[, c(features, target)],
      method = "lm"
    )

    # Evaluate the model on the held-out rows. The metrics are reported in
    # score units; they are no longer divided by 100, which also made RMSE
    # inconsistent with MAE and with the learning curve below.
    predictions <- predict(model, newdata = test_data[, features])
    test_errors <- predictions - test_data$score
    mae <- mean(abs(test_errors))
    mse <- mean(test_errors^2)
    rmse <- sqrt(mse)

    # Print evaluation metrics
    cat("Mean Absolute Error (MAE):", mae, "\n")
    cat("Mean Squared Error (MSE):", mse, "\n")
    cat("Root Mean Squared Error (RMSE):", rmse, "\n")

    # Actual vs. Predicted Plot
    actual_vs_predicted <- ggplot(
      data.frame(actual = test_data$score, predicted = predictions),
      aes(x = actual, y = predicted)
    ) +
      geom_point() +
      geom_abline(intercept = 0, slope = 1, color = "red") +
      labs(x = "Actual Score", y = "Predicted Score", title = "Actual vs. Predicted Plot")

    # Distribution of the training residuals
    train_residuals <- residuals(model)
    residuals_distribution <- ggplot() +
      geom_histogram(aes(x = train_residuals, fill = "Residuals"), bins = 30, alpha = 0.7) +
      labs(x = "Residuals", y = "Frequency", title = "Distribution of Residuals")

    # Feature Importance Plot
    feature_importance <- varImp(model)$importance
    feature_importance_plot <- ggplot(
      as.data.frame(feature_importance),
      aes(x = reorder(rownames(feature_importance), Overall), y = Overall)
    ) +
      geom_bar(stat = "identity", fill = "skyblue") +
      coord_flip() +
      labs(x = "Features", y = "Importance", title = "Feature Importance Plot")

    # Test RMSE for models trained on growing random fractions of the
    # training set. vapply() guarantees one numeric value per fraction.
    train_sizes <- seq(0.1, 0.9, by = 0.1)
    rmse_values <- vapply(train_sizes, function(size) {
      subset_rows <- sample(seq_len(nrow(train_data)), size = size * nrow(train_data))
      subset_model <- train(
        score ~ .,
        data = train_data[subset_rows, c(features, target)],
        method = "lm"
      )
      subset_predictions <- predict(subset_model, newdata = test_data[, features])
      sqrt(mean((subset_predictions - test_data$score)^2))
    }, numeric(1))

    # Learning Curve
    learning_curve <- ggplot() +
      geom_line(aes(x = train_sizes, y = rmse_values)) +
      labs(x = "Training Set Size", y = "RMSE", title = "Learning Curve")

    # Partial Dependence Plots (PDP); built but not part of the combined plot
    partial_dependence_plots <- list()
    for (feature in features) {
      pdp_data <- partial(model, pred.var = feature)
      pdp_plot <- autoplot(pdp_data, feature = feature) +
        labs(title = paste("Partial Dependence Plot for", feature))
      partial_dependence_plots[[feature]] <- pdp_plot
    }

    # Residuals vs. Features Plots; built but not part of the combined plot.
    # Each plot is created inside its own function call so that the lazily
    # evaluated `.data[[feature]]` keeps the right feature name.
    residuals_vs_features_plots <- lapply(
      setNames(features, features),
      function(feature) {
        ggplot(train_data, aes(x = .data[[feature]], y = train_residuals)) +
          geom_point() +
          geom_smooth() +
          labs(x = feature, y = "Residuals", title = paste("Residuals vs.", feature))
      }
    )

    # Combine the four main plots in a single 2x2 canvas
    combined_plot <- ggarrange(
      actual_vs_predicted, residuals_distribution,
      feature_importance_plot, learning_curve,
      ncol = 2, nrow = 2, labels = c("A", "B", "C", "D"), common.legend = TRUE
    )
    # Print the combined plot
    print(combined_plot)
    # Save the combined plot to a file
    # ggsave("combined_plot.png", combined_plot, width = 15, height = 12, units = "cm")
  } else {
    cat("Some of the specified columns do not exist in the training data.")
  }
})
## Output of the recorded run, rescaled to score units (that run printed MAE
## and MSE divided by 100, and the RMSE derived from the scaled MSE):
## Mean Absolute Error (MAE): 17.10224
## Mean Squared Error (MSE): 538.5725
## Root Mean Squared Error (RMSE): 23.20717